In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from scipy.stats import boxcox
from sklearn.preprocessing import RobustScaler
import missingno as msno
In [2]:
pipeline_df = pd.read_csv(r'C:\Users\mmotd\OneDrive\Documents\Boot Camp Files\Shiny Project\Shiny_Project\Pipeline_Incidents.csv')

Assessing missingness patterns in pipeline dataframe and handling them accordingly

In [3]:
missing_data = pipeline_df.columns[pipeline_df.isnull().any()].tolist()

msno.matrix(pipeline_df[missing_data], labels = True)
Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x2073996c880>

Imputing 0 for incidents where Nan represents that no injuries/fatalities occurred

In [4]:
def imputation():
    pipeline_df.loc[:,'Public Evacuations':'Other Costs'] = pipeline_df.loc[:,
                                                                   'Public Evacuations':'Other Costs'].fillna(0)
    pipeline_df.loc[:, 
                'Pipeline/Facility Name':'Accident State'] = pipeline_df.loc[:, 
                                                                'Pipeline/Facility Name':'Accident State'].fillna('Unknown')
    pipeline_df.loc[:, 'Intentional Release (Barrels)'] = pipeline_df.loc[:, 
                                                                'Unintentional Release (Barrels)'].fillna(0)
    pipeline_df['Pipeline Shutdown'] = pipeline_df['Pipeline Shutdown'].fillna('NO')
    shutdown_null = pipeline_df.loc[:,['Shutdown Date/Time', 
                                   'Restart Date/Time']][pipeline_df['Shutdown Date/Time'].isna()].index
    pipeline_df.loc[shutdown_null,['Shutdown Date/Time', 'Restart Date/Time']] = pipeline_df.loc[shutdown_null, 
                                                                                            'Accident Date/Time']
    shutdown_null = pipeline_df.loc[:,['Shutdown Date/Time', 
                                   'Restart Date/Time']][pipeline_df['Restart Date/Time'].isna()].index
    pipeline_df.loc[shutdown_null,['Shutdown Date/Time', 'Restart Date/Time']] = pipeline_df.loc[shutdown_null, 
                                                                                            'Accident Date/Time']
    return pipeline_df
In [5]:
pipeline_df = imputation()
In [6]:
pipeline_df.loc[:,['Accident Date/Time','Shutdown Date/Time',
       'Restart Date/Time']] = pipeline_df.loc[:,['Accident Date/Time','Shutdown Date/Time',
       'Restart Date/Time']].apply(lambda x: pd.to_datetime(x), axis = 1)
In [7]:
pipeline_df = pipeline_df.set_index('Accident Date/Time')
In [8]:
monthly_incidents = pd.DataFrame(pipeline_df.groupby(pd.Grouper(freq = 'm'))['Report Number'].count())
In [9]:
monthly_incidents['year'] = monthly_incidents.index.year
In [10]:
monthly_incidents.columns = ['incidents_num', 'year']
In [11]:
yearly_incidents = pd.DataFrame(pipeline_df.groupby(pd.Grouper(freq = 'y'))['Report Number'].count())
In [12]:
yearly_incidents.index = yearly_incidents.index.year
yearly_incidents = yearly_incidents.loc['2010':'2016',:]
In [13]:
yearly_incidents.columns = ['num_incidents']
In [14]:
yearly_graph = px.line(yearly_incidents, x = yearly_incidents.index, 
                       y= 'num_incidents', title = 'Yearly Number of Incidents')
yearly_graph.update_xaxes(title = 'Year')
yearly_graph.update_yaxes(title = '# of Incidents')
2010201120122013201420152016340360380400420440460
Yearly Number of IncidentsYear# of Incidents

The trend of incidents shows an exponential upwards trend between 2011 and 2014. From 2014 to 2015, there is a slight upwards linear trend followed by a decresing linear trend in 2016. This may suggest the implemetation of more robust reliability systems such as predictive maintenance or leak detection systems.

What are the growth trends of accident categories

In [15]:
yearly_category = pipeline_df.groupby([pd.Grouper(freq = 'y'), 
                                          'Cause Category'])['Report Number'].count()
In [16]:
yearly_category = yearly_category.reset_index().set_index('Accident Date/Time')
yearly_category.index = yearly_category.index.year
yearly_category = yearly_category.loc[:2016, :]
In [17]:
yearly_cause = px.line(yearly_category, x = yearly_category.index, 
                       y= 'Report Number', title = 'Yearly Number of Incidents by Cause Category', color = 'Cause Category')
yearly_cause.update_xaxes(title = 'Year')
yearly_cause.update_yaxes(title = '# of Incidents')
2010201120122013201420152016050100150200250
Cause CategoryALL OTHER CAUSESCORROSIONEXCAVATION DAMAGEINCORRECT OPERATIONMATERIAL/WELD/EQUIP FAILURENATURAL FORCE DAMAGEOTHER OUTSIDE FORCE DAMAGEYearly Number of Incidents by Cause CategoryYear# of Incidents

Delving deeper into the primary causes of pipeline incidents over time reveals that Material/Weld/Equipment failures followed by corrosion are the primary culprits. This can further be mitigated by by more robust reliability measures

In [18]:
px.histogram(pipeline_df, 
             y = 'Cause Category', 
             title = 'Total Number of Incidents by Cause Category').update_yaxes(categoryorder = 'total ascending')
0200400600800100012001400OTHER OUTSIDE FORCE DAMAGEEXCAVATION DAMAGENATURAL FORCE DAMAGEALL OTHER CAUSESINCORRECT OPERATIONCORROSIONMATERIAL/WELD/EQUIP FAILURE
Total Number of Incidents by Cause CategorycountCause Category
In [19]:
causes_df = pipeline_df.loc[:,['Cause Category', 'Cause Subcategory']]
In [20]:
causes_df = causes_df[(causes_df['Cause Category'] == 'CORROSION') | (causes_df['Cause Category'] == 'MATERIAL/WELD/EQUIP FAILURE')]
In [21]:
px.histogram(causes_df, y = 'Cause Subcategory', 
             title = 'Total Number of Incidents by Cause Subcategory').update_yaxes(categoryorder = 'total ascending')
0100200300ENVIRONMENTAL CRACKING-RELATEDDEFECTIVE OR LOOSE TUBING/FITTINGFAILURE OF EQUIPMENT BODYMANUFACTURING-RELATEDCONSTRUCTION, INSTALLATION OR FABRICATION-RELATEDTHREADED CONNECTION/COUPLING FAILUREMALFUNCTION OF CONTROL/RELIEF EQUIPMENTOTHER EQUIPMENT FAILUREEXTERNALNON-THREADED CONNECTION FAILUREPUMP OR PUMP-RELATED EQUIPMENTINTERNAL
Total Number of Incidents by Cause SubcategorycountCause Subcategory

Some of the most common modes of failure are related to corrosion and mechanical/coupling related equipment failures. It would be useful to assess if the modes of failure have a seasonality aspect.

In [22]:
equip_failure = pipeline_df[pipeline_df['Cause Category'] == 'MATERIAL/WELD/EQUIP FAILURE']
In [23]:
equip_failure = equip_failure.groupby([pd.Grouper(freq = 'm')])['Report Number'].count()
In [24]:
equip_failure.index = equip_failure.index.strftime('%B-%y')
In [25]:
equip_seasonal = seasonal_decompose(equip_failure, model = 'Multiplicative', period = 12)
equip_seasonal.plot()
Out[25]:
In [26]:
seasonal_equipment = px.histogram(equip_failure, x = equip_failure.index, 
             y= equip_seasonal.seasonal,histfunc = 'avg', nbins= 120, title = 'Seasonality of Equipment/Seal Failure')
seasonal_equipment.update_yaxes(title = 'Average Change in Incidents by Month')
seasonal_equipment.show()
January-10March-10May-10July-10September-10November-10January-11March-11May-11July-11September-11November-11January-12March-12May-12July-12September-12November-12January-13March-13May-13July-13September-13November-13January-14March-14May-14July-14September-14November-14January-15March-15May-15July-15September-15November-15January-16March-16May-16July-16September-16November-16−6−4−202468
Seasonality of Equipment/Seal FailureAccident Date/TimeAverage Change in Incidents by Month

It would appear that most equipment related failures occur during the transition from winter to spring. This could be caused by change in temperature between seasons. Mechanical equipments contracts in the winter and expands with heat. As the equipment stabilizes to temperature, the incidence rate decreases.

In [27]:
yearly_costs = pd.DataFrame(pipeline_df.groupby([pd.Grouper(freq = 'y')])['All Costs'].sum())
In [28]:
yearly_costs = yearly_costs[:2016]
In [29]:
yearly_costs.index = yearly_costs.index.year
In [30]:
yearly_equipcosts = pipeline_df[(pipeline_df['Cause Category'] == 'CORROSION') | \
                              (pipeline_df['Cause Category'] == \
                               'MATERIAL/WELD/EQUIP FAILURE')].groupby(pd.Grouper(freq = 'y'))['All Costs'].sum()
In [31]:
yearly_equipcosts.index = yearly_equipcosts.index.year
In [32]:
yearly_costs['Equip Failure Cost'] = yearly_equipcosts
In [33]:
yearly_costs = yearly_costs.loc[:2016,:]
In [34]:
yearly_costs['Ratio'] = yearly_costs['Equip Failure Cost'] / yearly_costs['All Costs']
In [35]:
px.histogram(pipeline_df, x = 'All Costs', title = 'Histogram of Cost Distribution', nbins = 10000).update_yaxes(title = 'Frequency')
010M2M4M6M8M05010203040607080
Histogram of Cost DistributionAll CostsFrequency

Based on the price distribution histogram, it is evident that the data has severe right skew and significant kurtosis. A majority of pipeline incidents yield losses of under 100k USD.

In [36]:
px.histogram(pipeline_df, x = 'All Costs', title = 'Histogram of Cost Distribution', nbins = 10000, color = 'Accident Year')
05M1M2M3M4M6M0204060801030507090
Accident Year20102011201220132014201520162017Histogram of Cost DistributionAll Costscount
In [37]:
px.bar(yearly_costs, x = yearly_costs.index, 
       y = 'Ratio', title = 'Ratio of Equipment/Corrosion Costs to Total Costs').update_xaxes(title = 'Year')
201020112012201320142015201600.10.20.30.40.50.60.70.8
Ratio of Equipment/Corrosion Costs to Total CostsYearRatio
In [38]:
px.histogram(pipeline_df, x = pipeline_df.index.strftime('%B-%y'), y = 'All Costs', title = 'Cost Over Time')
January-10March-10May-10July-10September-10November-10January-11March-11May-11July-11September-11November-11January-12March-12May-12July-12September-12November-12January-13March-13May-13July-13September-13November-13January-14March-14May-14July-14September-14November-14January-15March-15May-15July-15September-15November-15January-16March-16May-16July-16September-16November-16January-170200M400M600M800M
Cost Over Timexsum of All Costs
In [39]:
pipeline_df['Down Time'] = (pipeline_df['Restart Date/Time'] - pipeline_df['Shutdown Date/Time'])
pipeline_df['Down Time'] = pipeline_df['Down Time'] / np.timedelta64(1, 'h')
In [40]:
px.histogram(pipeline_df, x = 'Down Time', nbins = 123, 
             title = 'Distribution of Downtime').update_xaxes(title = 'Hours').update_yaxes(title = 'Frequency')
05001000150020002500300035004000450001000200400600800
Distribution of DowntimeHoursFrequency
In [41]:
px.histogram(pipeline_df, 
             x = 'Accident State', 
             title = 'Distribution of Incidents by State').update_xaxes(categoryorder = \
                                                                        'total descending').update_yaxes(title = 'Frequency')
TXOKLACAKSILWYNJMNNMINOHNDIAPAMOMSMTMIWICONEALVAGASCNCKYNYARUnknownAKSDMDUTWATNFLORIDHIMAWVCTPRNVME02004006008001000
Distribution of Incidents by StateAccident StateFrequency
In [42]:
px.histogram(pipeline_df, x = 'Net Loss (Barrels)', 
             title = 'Distribution of Net Product Loss').update_yaxes(title = 'Frequency')
020040060080010001200140016001800050100150200250300
Distribution of Net Product LossNet Loss (Barrels)Frequency
In [43]:
operator_counts = pipeline_df.groupby(['Operator Name'])['Report Number'].count().sort_values(ascending = False).head(20)
In [44]:
px.histogram(operator_counts, y = operator_counts.index, 
             x = operator_counts, title = \
             'Top 20 Operators with Highest Incident Rate'\
            ).update_yaxes(categoryorder = 'total ascending').update_xaxes(title = 'Total Number of Incidents')
050100150200CONOCOPHILLIPSCHEVRON PIPE LINE COKOCH PIPELINE COMPANY, L.P.ONEOK NGL PIPELINE, LLCMAGELLAN AMMONIA PIPELINE, L.P.ONEOK NGL PIPELINE LPEXPLORER PIPELINE COEXXONMOBIL PIPELINE COSHELL PIPELINE CO., L.P.ENBRIDGE ENERGY, LIMITED PARTNERSHIPPHILLIPS 66 PIPELINE LLCKINDER MORGAN LIQUID TERMINALS, LLCMARATHON PIPE LINE LLCBUCKEYE PARTNERS, LPCOLONIAL PIPELINE COMAGELLAN PIPELINE COMPANY, LPENTERPRISE PRODUCTS OPERATING LLCPLAINS PIPELINE, L.P.SUNOCO PIPELINE L.P.ENTERPRISE CRUDE PIPELINE LLC
Top 20 Operators with Highest Incident RateTotal Number of IncidentsOperator Name
In [45]:
operators_price = pipeline_df.groupby(['Operator Name'])['All Costs'].sum().sort_values(ascending = False).head(20)
In [46]:
px.histogram(operators_price, y = operators_price.index, 
             x = operators_price, title = \
             'Top 20 Operators with Highest Incident Costs'\
            ).update_yaxes(categoryorder = 'total ascending').update_xaxes(title = 'Total Cost in Dollars')
0200M400M600M800MTESORO HIGH PLAINS PIPELINE COMPANY LLCMID - VALLEY PIPELINE COEXPLORER PIPELINE COKOCH PIPELINE COMPANY, L.P.BUCKEYE PARTNERS, LPCHEVRON PETROCHEMICAL PIPELINE LLCMAGELLAN PIPELINE COMPANY, LPENTERPRISE CRUDE PIPELINE LLCENTERPRISE PRODUCTS OPERATING LLCSUNOCO PIPELINE L.P.SHELL PIPELINE CO., L.P.ALYESKA PIPELINE SERVICE COWEST SHORE PIPELINE COMARATHON PIPE LINE LLCMOBIL PIPE LINE COMPANYCHEVRON PIPE LINE COCOLONIAL PIPELINE COEXXONMOBIL PIPELINE COPLAINS PIPELINE, L.P.ENBRIDGE ENERGY, LIMITED PARTNERSHIP
Top 20 Operators with Highest Incident CostsTotal Cost in DollarsOperator Name